*******************************************************
*This file cleans institutions controls from the IPEDS*
*******************************************************

use "$clean_data_education/institutionDetails.dta", clear
merge 1:1 unitid yearGroup  using "$clean_data_education/tuitionFees.dta", keep(1 3) nogenerate
merge 1:1 unitid yearGroup  using "$clean_data_education/financialAid.dta", keep(1 3) keepusing(anyaidp avgAidAmount) nogenerate
	
*** control
qui bysort unitid: gen dup = cond(_N==1,0,_n)
	drop if dup == 0 & (control == -3 | control == 0)
replace control =. if control < 1 // missing recoded or not answered
sort unitid year  // try to get info from othe years when missing
by unitid: replace control = control[_n-1] if control ==.
gsort unitid -year
by unitid: replace control = control[_n-1] if control ==.
	label variable control "public control, private no-profit, private for profit"

gen public = 0
	replace public = 1 if control == 1
	label variable public "public control"
	label define public 0 "private (profit/no-profit)" 1 "public"

gen forProfit = 0
	replace forProfit = 1 if control == 3
	label variable forProfit "for-profit control"
	label define forProfit 0 "public or non-profit" 1"private for-profit" 

gen nonProfit = 0
	replace nonProfit = 1 if control == 2
	label variable nonProfit "non-profit control"
	label define nonProfit 0 "public or non-profit" 1"private for-profit" 

*** sector
replace sector = . if sector == 99 // missing recoded or not answered
sort unitid year  // try to get info from othe years when missing
by unitid: replace sector = sector[_n-1] if sector ==.
gsort unitid -year
by unitid: replace sector = sector[_n-1] if sector ==.
drop if sector == 0 // drop administrative unit only

gen communityCollege = 0 
	replace communityCollege = 1 if sector >= 4
	label variable communityCollege "community college, 2 years or less"
	label define communityCollege 0"university" 1"community College"
	label value communityCollege communityCollege

*top uni
* source: https://www.topuniversities.com/where-to-study/north-america/united-states/ranked-top-100-us-universities
* Top20
gen uniTop20 = 0
	replace uniTop20 = 1 if unitid == 166027 // harvard
	replace uniTop20 = 1 if unitid == 243744 // stanford
	replace uniTop20 = 1 if unitid == 166683 // MIT
	replace uniTop20 = 1 if unitid == 110635 // berkeley
	replace uniTop20 = 1 if unitid == 190150 // columbia
	replace uniTop20 = 1 if unitid == 110662 // UCLA
	replace uniTop20 = 1 if unitid == 130794 // yale 
	replace uniTop20 = 1 if unitid == 215062 // pennsylvania
	replace uniTop20 = 1 if unitid == 186131 // princeton
	replace uniTop20 = 1 if unitid == 190415 // cornell
	replace uniTop20 = 1 if unitid == 193900 // NYU
	replace uniTop20 = 1 if unitid == 144050 // chicago
	replace uniTop20 = 1 if unitid == 198419 // duke
	replace uniTop20 = 1 if unitid == 162928 // Johns Hopkins 
	replace uniTop20 = 1 if unitid == 123961 // uni Southern California
	replace uniTop20 = 1 if unitid == 147767 // northwesten
	replace uniTop20 = 1 if unitid == 211440 // carnegie mellon
	replace uniTop20 = 1 if unitid == 170976 // uni of michigan
	replace uniTop20 = 1 if unitid == 110404 // caltech
	replace uniTop20 = 1 if unitid == 217156 // brown
	label variable uniTop20 "top 20 ranking univeristies"

* Top30
gen uniTop30 = 0
	replace uniTop30 = 1 if uniTop20 == 1
	replace uniTop30 = 1 if unitid == 164988 // boston university
	replace uniTop30 = 1 if unitid == 227757 // rice university
	replace uniTop30 = 1 if unitid == 131496 // georgetown 
	replace uniTop30 = 1 if unitid == 236948 | unitid == 377555 | unitid == 377564 // uni of wash.
	replace uniTop30 = 1 if unitid == 228778 // uni of texas at austen
	replace uniTop30 = 1 if unitid == 110680 // uni of calif san diego
	replace uniTop30 = 1 if unitid == 139658 // emory uni
	replace uniTop30 = 1 if unitid == 110644 // uni of calif davis
	replace uniTop30 = 1 if unitid == 179867 // washington uni is st louis
	replace uniTop30 = 1 if unitid == 195030 // uni of rochester 
	label variable uniTop30 "top 30 ranking univeristies"

* Top50
gen uniTop50 = 0
	replace uniTop50 = 1 if uniTop20 == 1
	replace uniTop50 = 1 if uniTop30 == 1
	replace uniTop50 = 1 if unitid == 221999 // vanderbilt uni
	replace uniTop50 = 1 if unitid == 139755 // georgia institute of tech
	replace uniTop50 = 1 if unitid == 145637 // university of illinois at urbana
	replace uniTop50 = 1 if unitid == 131469 // george washington uni
	replace uniTop50 = 1 if unitid == 168148 // tufts uni
	replace uniTop50 = 1 if unitid == 134130 // university of florida
	replace uniTop50 = 1 if unitid == 182670 // dartmouth college
	replace uniTop50 = 1 if unitid == 199120 // university of north carolina, chapel hill
	replace uniTop50 = 1 if unitid == 135726 // university of miami
	replace uniTop50 = 1 if unitid == 152080 // university of notre dame
	replace uniTop50 = 1 if unitid == 186380 | unitid == 186399 | unitid == 186371 // rutgers uni
	replace uniTop50 = 1 if unitid == 110653 // university of california irvine
	replace uniTop50 = 1 if unitid == 201645 // case western reserve 
	replace uniTop50 = 1 if unitid == 145600 // university of illinois chicago
	replace uniTop50 = 1 if unitid == 196097 // stony brook university
	replace uniTop50 = 1 if unitid == 196130 // university at buffalo suny
	replace uniTop50 = 1 if unitid == 214777 // Pennsylvania State University main campus
	replace uniTop50 = 1 if unitid == 164924 // boston college
	replace uniTop50 = 1 if unitid == 163204 // university of maryland
	replace uniTop50 = 1 if unitid == 234076 // university of virginia
	label variable uniTop50 "top 50 ranking univeristies"


* financial aid
egen anyAid = mean(anyaidp) if forProfit == 0, by(unitid) // since this variable is only available from 2000, i take an average
	label variable anyAid "% of students with any financial aid (avg2000-2014)"
egen avgAmountAid = mean(avgAidAmount) if forProfit == 0, by(unitid) // since this variable is only available from 2000, i take an average
	label variable avgAmountAid "avg amount of financial aid (avg2000-2014)"
	drop anyaidp avgAidAmount 


* unmutable informations... keep in only the first year  
sort unitid year 
qui bysort unitid: gen dup2 = cond(_N==1,0,_n)
drop if dup2 > 1
drop dup dup2

rename yearGroup firstYear
	label variable firstYear "first year in the inst. details dataset"

save "$clean_data_education/temp.dta", replace 
********************************************************************
* weight: number of students
clear
use "$clean_data_education/fallEnrollmentTotalsReshaped.dta"
keep unitid year total1 total4
reshape wide total1 total4, i(unitid) j(year)
foreach i in 1990 1994 2000 2007 2014 {
	rename total1`i' students`i'
	rename total4`i' freshmenFullTime`i'
	label variable students`i' "number of students in `i'"
}


egen studentsAvg = rowmean(students1994 students2000 students2007)
	label variable studentsAvg "average # of students over 93-07"

egen freshmenFullTimeAvg = rowmean(freshmenFullTime1994 freshmenFullTime2000 freshmenFullTime2007)
	replace freshmenFullTimeAvg = studentsAvg if !mi(studentsAvg) & (freshmenFullTimeAvg == 0 | mi(freshmenFullTimeAvg))
	label variable freshmenFullTimeAvg "average # of full-time freshmen 93-07"

*Largest 150 universities
gsort -studentsAvg
gen id = _n
gen largest150 = (_n<=150)
gen largest100 = (_n<=100)
gen largest50 = (_n<=50)
	label variable largest150 "the largest 150 universities"
	drop id

merge 1:m unitid using "$clean_data_education/temp.dta", keep(2 3) nogenerate // 2 and 3 since 1098 are from 1991 but have no institution details while the rest is non-mainland 

keep unitid studentsAvg students???? instnm control sector avgCost freshmenFullTimeAvg ///
public communityCollege uniTop* largest* forProfit nonProfit firstYear anyAid avgAmountAid iclevel

save "$clean_data_education/temp2.dta", replace 
********************************************************************

* weight: number of degrees
use "$clean_data_education/completionsAwardedTotalsReshaped.dta", clear
keep unitid year total42 total40 total32 total31 total30 total22 total21 ///
total20 total12 total11 total10

egen awards = rsum (total42 total40 total32 total31 total30 total22 total21 ///
total20 total12 total11 total10)
keep unitid yearGroup awards

reshape wide awards, i(unitid) j(year)
foreach i in 1990 1994 2000 2007 2014 {
label variable awards`i' "number of students awarded in `i'"
}


merge 1:m unitid using "$clean_data_education/temp2.dta", keep(2 3) nogenerate // 2 and 3 since 1098 are from 1991 but have no institution details while the rest is non-mainland 

******** CZ variables 
gen forProfitEnr = students1994
	replace forProfitEnr = 0 if forProfit == 0
gen nonProfitEnr = students1994
	replace nonProfitEnr = 0 if nonProfit == 0
gen publicEnr = students1994
	replace publicEnr = 0 if public == 0

foreach i in 1990 1994 2000 2007 2014 { 
gen students4Y`i' = 0
	replace students4Y`i' = students`i' if (sector >= 1 & sector <= 3) 
gen students2Y`i' = 0
	replace students2Y`i' = students`i' if (sector >= 4 & sector <= 6) 
gen studentsL2Y`i' = 0
	replace studentsL2Y`i' = students`i' if (sector >= 7 & sector <= 9) 
gen awards2Y`i' = 0
	replace awards2Y`i' = awards`i' if (sector >= 4 & sector <= 6) 
}


save "$clean_data_education/institutionControls.dta", replace 

drop if forProfit == 1

**** Number of uni 
gen numberOfUni = 1 // this will summ up once you do the collapse at czone level
	label variable numberOfUni "number of uni "

* add locations
merge 1:m unitid using "$clean_data_education/locations.dta", keep(3) nogenerate
	drop if fips == 2 | fips == 15 | (fips >= 60&fips<=78) // dropping Alaska, Puerto Rico...etc


	keep if yearGroup == 1994
**********************************************************
* Run it all together, save labels and collapse
foreach v of var * {
	local l`v' : variable label `v'
	if `"`l`v''"' == "" {
	local l`v' "`v'"
	}
}
order unitid czone instnm
collapse (mean) avgCost anyAid avgAmountAid (sum) awards* students* freshmenFullTimeAvg largest* public* forP* nonP* uniTop?? numberOfUni communityCollege [iw=ourRatio], by(czone)

foreach v of var * {
label var `v' "`l`v''"
}


* share of uni by control
egen private = rsum(forProfit nonProfit), missing
gen shareOfPublic = public/numberOfUni
gen shareOfForProfit = forProfit/numberOfUni
gen shareOfNonProfit = nonProfit/numberOfUni
gen shareOfPrivate = private/numberOfUni
gen shareOfCommunityC = communityCollege/numberOfUni

gen shForProfitEnr = forProfitEnr/students1994
gen shaProfitEnr = nonProfitEnr/students1994
gen shPublicEnr = publicEnr/students1994
gen shareOf2YEnr = students2Y1990/students1990
gen shareOf2YAwr = awards2Y1990/awards1990

gen uniTop20Dummy = 0
	replace uniTop20Dummy = 1 if uniTop20 > 0
gen uniTop30Dummy = 0
	replace uniTop30Dummy = 1 if uniTop30 > 0
gen uniTop50Dummy = 0
	replace uniTop50Dummy = 1 if uniTop50 > 0
gen largest150Dummy = 0
	replace largest150Dummy = 1 if largest150 > 0
gen largest100Dummy = 0
	replace largest100Dummy = 1 if largest100 > 0
gen largest50Dummy = 0
	replace largest50Dummy = 1 if largest50 > 0

save "$final_data_covariates/IPEDS_institutionControlsCZ.dta", replace 
rm "$clean_data_education/temp.dta"
rm "$clean_data_education/temp2.dta"

